require(gplots)
require(quanteda)
require(LSX)

## training
##
## Fits an LSS model on the corpus read from "usecase/data_2021_latin.rds",
## using dictionary-tagged future and perfect verb forms as the positive and
## negative seed features, and writes the fitted model to "lss_usecase.rds".

# Temporality dictionary. The code below reads the `future` and `perfect`
# entries under `de`, each of which is indexed by a `main` and an `aux` key.
# NOTE(review): `de` presumably holds German entries - confirm it matches the
# language of the corpus.
dict <- dictionary(file = "temporality.yml")

# Corpus -> tokens (URLs and punctuation removed, tokens shorter than two
# characters dropped) -> dfm keeping features with a total count of at least 2.
corp <- corpus(readRDS("usecase/data_2021_latin.rds"))
toks <- tokens_select(
    tokens(corp, remove_url = TRUE, remove_punct = TRUE),
    min_nchar = 2
)
dfmt <- dfm_trim(dfm(toks), min_termfreq = 2)

# Builds the tagged columns for one dictionary entry: selects the features
# matching entry["main"], multiplies each document's counts by 0 when no
# feature matching entry["aux"] occurs in that document (and by 1 otherwise),
# then appends `suffix` to the feature names.
tag_tense <- function(x, entry, suffix) {
    main <- dfm_select(x, entry["main"])
    aux_present <- rowSums(dfm_select(x, entry["aux"])) > 0
    tagged <- main * as.integer(aux_present)
    colnames(tagged) <- paste0(colnames(tagged), suffix)
    tagged
}

dfmt_future <- tag_tense(dfmt, dict$de$future, "/future")
dfmt_perfect <- tag_tense(dfmt, dict$de$perfect, "/past")

# Replace the untagged dictionary words with their tagged counterparts.
dfmt2 <- dfm_remove(dfmt, dict$de)
dfmt3 <- cbind(dfmt2, dfmt_future, dfmt_perfect)

# Glob seeds: features ending in "/future" get weight 1, features ending in
# "/past" get weight -1.
seed <- c(
    "*/future" = 1,
    "*/past" = -1
)
lss <- textmodel_lss(dfmt3, seed, cache = TRUE, k = 100)
saveRDS(lss, "lss_usecase.rds")

## prediction
##
## Scores the coded documents with the fitted model `lss` and saves their
## document variables, the scores and the binned scores to "data_usecase.rds".

# Labels for `class1`; factor() assigns them, in this order, to the sorted
# unique values of the column.
issue <- c(
    "Domestic Security", "Economy", "Education", "Environment",
    "Campaign Event", "Foreign Policy", "Healthcare", "Immigration",
    "Infrastructure", "Housing", "Other Policy", "Political Entity",
    "Other", "Social Welfare", "Wages"
)

# Read the coded data and drop its first column.
dat_coded <- read.csv(
    "usecase/trainign_tw_issues.csv",
    fileEncoding = "utf-8"
)[-1]

# NOTE(review): factor() stops with an error unless the number of distinct
# `class1` values equals length(issue) - confirm every category occurs in the
# file and that the sort order of the codes matches the order of `issue`.
dat_coded$class1 <- factor(dat_coded$class1, labels = issue)

# Same tokenisation options as the training corpus, but without the
# minimum-length filter and without frequency trimming.
corp_coded <- corpus(dat_coded)
toks_coded <- tokens(corp_coded, remove_url = TRUE, remove_punct = TRUE)
dfmt_coded <- dfm(toks_coded)

# Start from the document variables and attach the LSS score of each document.
dat <- docvars(dfmt_coded)
dat$lss <- predict(lss, dfmt_coded, min_n = 10)

# Eight right-closed bins: (-Inf,-3], (-3,-2], ..., (2,3], (3, Inf].
dat$lss_bin <- cut(dat$lss, breaks = c(-Inf, -3:3, Inf))

saveRDS(dat, "data_usecase.rds")
